Google 1-gram Data

I wanted to do another project on time. And so I flirted with the google n-gram dataset.

In [10]:
using PyPlot
In [11]:
X = readdlm("../data/T.dlm")
X = X./sum(X,2);
X = X./maximum(X,1)
X = X*5;

Words = readdlm("../data/Words.dlm");
In [12]:
(U,S,V) = svd(X);
In [13]:
k = 8
n = size(X,1)
m = size(X,2)

# Inverse of the vector operator
function ivec(z)
    D = reshape(z[1:n*k], n,k)
    E = reshape(z[n*k + 1:end], m,k)
    return (D,E)
end

T = Tridiagonal(-ones(n-1), 2*ones(n), -ones(n-1))

# Gradient of objective function 
function f(z, arg = 0, α = 1., β = 0.1)
    (D,E) = ivec(z)
    R = X - D*E'
    fx = (vecnorm(R)^2)/2
    ∇gD = R*E
    ∇gE = R'*D
    return (fx, -[∇gD[:] - α*D[:] - β*(T*D)[:]; ∇gE[:] - α*E[:]])
end;
In [14]:
prox = (x) -> [max(0, x[1:n*k]); max(0,x[n*k+1:end])]

x = 2*randn((n+m)*k)
y = zeros((n+m)*k)
t = 0.0001;
CONV = []

for i = 1:200*1
    (fy, ∇fy) = f(y)
    x = prox(x - t*∇fy)
    y = 0.95*y + x
    push!(CONV, fy)
end
In [16]:
figure(figsize = (5,2))
plot(log(10,map(Float64,CONV)));
In [17]:
(D,E) = ivec(y)

figure(figsize = (10,1.0*k))

for i = 1:k
    subplot(k,2,i)
    plot(D[:,i],"k")   
    #title("$(round(norm(D[:,i]),2))")
    tick_params(axis="x",bottom="off",top="off",labelbottom="off") 
    tick_params(axis="y",right="off",left="off",labelleft="off") 
end
In [ ]:
writedlm("../data/Dstar", D)
writedlm("../data/Estar", E)

Every run, admitabily, comes out a little different. Fact is, I spent days in the addicting task of tweaking parameters and re-running the algorithm to make sure things were just right. There is something addictive about babying gradient descent, watching the objective go down.

But here's something I prepared earlier.

In [21]:
D = readdlm("../data/Dstar")
E = readdlm("../data/Estar");
In [34]:
(D,E) = ivec(y)
w = sortperm(sum(abs(D*E' - X),1)[:])
i = size(X,2)-20

fig = figure(figsize=(6, 3))
plot(D*E[w[i],:]',"k")
plot(X[:,w[i]],"r")
title(Words[w[i]]);
In [70]:
figure(figsize = (12,20*5))
for d_elem = 1:8
    subplot(1,8,d_elem)
    w = flipdim(sortperm(EÌ‚[:,d_elem]),1)
    imshow(X[:,w][:,1:10000]')
    tick_params(axis="x",bottom="off",top="off",labelbottom="off") 
    tick_params(axis="y",right="off",left="off",labelleft="off") 
end
error in running finalizer: Base.ArgumentError(msg="ref of NULL PyObject")
error in running finalizer: Base.ArgumentError(msg="ref of NULL PyObject")
error in running finalizer: Base.ArgumentError(msg="ref of NULL PyObject")
In [5]:
using PyPlot

EÌ‚ = E./(sum(E,1))

w = flipdim(sortperm(E[:,4]),1)
figure(figsize = (6,50))

ln = nothing

t = 0.4
Re = t*(D*E') + (1-t)*X

for i = 1:150
    ln = 1.3*(Re[1:108,w[i]] - minimum(X[:,w[i]]))
    ln = 250*(ln./norm(ln,1))
    plot(ln  - 0.4*i,"k",linewidth = 0.1)
    scatter(collect(0:107), 
            ln - 0.4*i, 
            abs(ln).^(1.5)/2, 
            "k", 
            edgecolors="none")
end

tick_params(axis="x",bottom="off",top="off",labelbottom="off") 
tick_params(axis="y",right="off",left="off",labelleft="off") 
axis((-13.0,120.0,-500.0,16))
Out[5]:
(-13.0,120.0,-500.0,16)
In [2]:
HTML(readall(open("style.css")))
Out[2]: